#ifdef __sw_slave__
#define _SIMD_SLAVE 1
#endif
#include <cassert>
#include "swarch.h"
#include "cell.h"

#define NPE_PACK 32
typedef struct {
  long offset[NPE_PACK + 1];
  long total_size;
  cellgrid_t *grid;
  char *buf;
  int xlo, xhi, ylo, yhi, zlo, zhi;

} packunpack_param_t;
typedef struct {
  int natom[NPE_PACK + 1], ncell[NPE_PACK + 1], nodd[NPE_PACK + 1];
} pack_index_t;
struct Divisor {
  long mul, shift, rval;
  __always_inline Divisor(int val) {
    int k = 0;
    while ((1L << (k + 1)) < val)
      k++;
    long m = ((1L << k + 32) + val - 1) / val;
    mul = m;
    shift = k + 32;
    rval = val;
  }
};
__always_inline int operator/(const int &a, const Divisor &b) {
  return a * b.mul >> b.shift;
}
__always_inline int operator%(const int &a, const Divisor &b) {
  return a - b.rval * (a / b);
}

#ifdef __sw_host__
#include <qthread.h>
#include <unistd.h>
extern void slave_cell_export_cpe(cellgrid_t *);
void cell_export_sw(cellgrid_t *grid) {
  // sleep(1);
  qthread_spawn(slave_cell_export_cpe, grid);
  qthread_join();
}
extern void slave_cell_import_cpe(cellgrid_t *);
void cell_import_sw(cellgrid_t *grid) {
  qthread_spawn(slave_cell_import_cpe, grid);
  qthread_join();
}
__always_inline size_t get_size_pack_export(char *buf, celldata_t *cell) {
  return cell->nbytes_export;
}
__always_inline size_t get_size_unpack_export(char *buf, celldata_t *cell) {
  long ret = *(int *)((char *)buf + 8);
  return ret;
}
__always_inline size_t get_size_most(int natom) {
  size_t ret = 8;
  ret += sizeof(long) * natom; //tag
  ret = ret + 7L & ~7L;
  ret += sizeof(vec<real>) * natom; //x
  ret = ret + 7L & ~7L;
  ret += sizeof(real) * natom; //q
  ret = ret + 7L & ~7L;
  ret += sizeof(int) * natom; //t
  ret = ret + 7L & ~7L;
  ret += sizeof(real) * natom; //mass
  ret = ret + 7L & ~7L;
  ret += sizeof(real) * natom; //rmass
  ret = ret + 7L & ~7L;
  return ret;
}
__always_inline size_t get_size_pack_most(char *buf, celldata_t *cell) {
  return get_size_most(cell->natom);
}

__always_inline size_t get_size_unpack_most(char *buf, celldata_t *cell) {
  int natom = *(int *)buf;
  return get_size_most(natom);
}
__always_inline size_t get_size_vec(char *buf, celldata_t *cell) {
  return cell->natom * sizeof(real) * 3;
}
template <size_t (*get_size)(char *buf, celldata_t *cell)>
size_t build_pack_params_sw(packunpack_param_t *pm, char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  int xlen = xhi - xlo;
  int ylen = yhi - ylo;
  int zlen = zhi - zlo;

  pm->xlo = xlo;
  pm->ylo = ylo;
  pm->zlo = zlo;
  pm->xhi = xhi;
  pm->yhi = yhi;
  pm->zhi = zhi;
  pm->grid = grid;
  pm->buf = buf;

  int ntask_pe = (xlen * ylen * zlen + NPE_PACK - 1) / NPE_PACK;
  Divisor ntasks_div(ntask_pe);
  size_t offset = 0;
  int last_pe = 0;

  for (int i = xlo; i < xhi; i++) {
    for (int j = ylo; j < yhi; j++) {
      for (int k = zlo; k < zhi; k++) {
        int idx = ((i - xlo) * ylen + j - ylo) * zlen + k - zlo;
        int div = idx / ntasks_div;
        int mod = idx % ntasks_div;
        if (mod == 0) {
          pm->offset[div] = offset;
          last_pe = div;
        }
        size_t size = get_size(((char *)buf + offset), get_cell_xyz(grid, i, j, k));
        offset += size;
      }
    }
  }
  pm->offset[last_pe + 1] = offset;
  pm->total_size = offset;
}

extern void slave_unpack_export_cpe(packunpack_param_t *pm);
extern void slave_pack_export_cpe(packunpack_param_t *pm);
extern void slave_unpack_most_cpe(packunpack_param_t *pm);
extern void slave_pack_most_cpe(packunpack_param_t *pm);
size_t pack_brick_export_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  // assert(0);
  packunpack_param_t pm;
  build_pack_params_sw<get_size_pack_export>(&pm, buf, grid, xlo, xhi, ylo, yhi, zlo, zhi);
  qthread_spawn(slave_pack_export_cpe, &pm);
  qthread_join();

  return pm.total_size;
}
size_t unpack_brick_export_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  // assert(0);
  packunpack_param_t pm;
  build_pack_params_sw<get_size_unpack_export>(&pm, buf, grid, xlo, xhi, ylo, yhi, zlo, zhi);
  qthread_spawn(slave_unpack_export_cpe, &pm);
  qthread_join();

  return pm.total_size;
}
size_t pack_brick_most_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  // assert(0);
  packunpack_param_t pm;
  build_pack_params_sw<get_size_pack_most>(&pm, buf, grid, xlo, xhi, ylo, yhi, zlo, zhi);
  qthread_spawn(slave_pack_most_cpe, &pm);
  qthread_join();
  return pm.total_size;
}
size_t unpack_brick_most_sw(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
  // assert(0);
  packunpack_param_t pm;
  build_pack_params_sw<get_size_unpack_most>(&pm, buf, grid, xlo, xhi, ylo, yhi, zlo, zhi);
  qthread_spawn(slave_unpack_most_cpe, &pm);
  qthread_join();
  return pm.total_size;
}

#endif
#ifdef __sw_slave__
extern "C" {
#include <qthread_slave.h>
#include "dma_macros_new.h"
// #include "hilbert.c"
//#define EVT_PC0 PC0_CYCLE
//#define EVT_PC2 PC2_CNT_GLD
// //#define EVT_PC3 PC3_CNT_GST
#define LWPF_UNIT U(CELL_IE)
#define LWPF_KERNELS \
  K(CELL_EXPORT)     \
  K(CELL_IMPORT)
#include "lwpf3/lwpf.h"
// #define max(a, b) ((a) > (b) ? (a) : (b))
// #define min(a, b) ((a) < (b) ? (a) : (b))
}
#include "dma_funcs.hpp"
#include "field_copy.hpp"
template <const int alignment>
struct BufferPtr {
  private:
  long val;
  char *buf_base;

  public:
  __always_inline BufferPtr(long initial_val = 0, const char *buf = NULL) {
    static_assert((alignment & -alignment) == alignment, "Alignment must be a power of 2");
    val = initial_val;
  }
  __always_inline long get_aligned_val() {
    return (val + alignment - 1) & ~(alignment - 1);
  }
  __always_inline const BufferPtr &operator+=(long o) {
    val = get_aligned_val() + o;
    return *this;
  }
  template <typename T>
  __always_inline T *buf() {
    return (T *)(buf_base + get_aligned_val());
  }
  template <typename T>
  __always_inline void append_size(T *ptr, long n = 1) {
    val = get_aligned_val() + sizeof(*ptr) * n;
  }
};

template <const long cap = 32768, const long alignment = 8>
struct LDMPackBuffer {
  long off;
  void *mem;
  char ldm[cap];
  // DMA_CLASS;
  __always_inline LDMPackBuffer(void *mem_addr = NULL) {
    static_assert((alignment & -alignment) == alignment, "Alignment must be a power of 2");
    static_assert(alignment >= 4, "Alignment must be greater than 4");
    off = 0;
    mem = mem_addr;
    // dma_init_class();
  }
  __always_inline void flush() {
    dma_init();
    pe_put(mem, ldm, off);
    dma_syn();
    mem = (void *)((char *)mem + off);
    off = 0;
  }
  template <typename T>
  __always_inline T *push(int cnt) {
    long size = sizeof(T) * cnt;
    assert(size <= cap);
    if (size + off > cap) {
      flush();
    }
    T *ret = (T *)(ldm + off);
    off += size;
    off += (alignment - 1);
    off &= ~(alignment - 1);
    return ret;
  }
  template <typename T>
  __always_inline T *push_ldm(T *ptr, int count = 1) {
    T *tbuf = push<T>(count);
    for (int i = 0; i < count; i++) {
      field_copy(tbuf + i, ptr + i);
    }
    return tbuf;
  }
  template <typename T>
  __always_inline T *push_mem(T *ptr, int count = 1) {
    T *tbuf = push<T>(count);
    dma_init();
    pe_getn(ptr, tbuf, count);
    dma_syn();
    return tbuf;
  }
};
template <const long cap = 16384, const long alignment = 8>
struct LDMUnpackBuffer {
  long off;
  long rest_size;
  char *mem;
  char ldm[cap];
  // DMA_CLASS;
  __always_inline LDMUnpackBuffer(void *mem_addr = NULL, int maxcnt = 0x7fffffff) {
    static_assert((alignment & -alignment) == alignment, "Alignment must be a power of 2");
    static_assert(alignment >= 4, "Alignment must be greater than 4");
    off = cap;
    mem = (char *)mem_addr;
    rest_size = maxcnt;
    // dma_init_class();
    read();
  }
  template <typename T>
  __always_inline void shift() {
    T *src = (T *)(ldm + off);
    T *dst = (T *)(ldm);
    long n = (cap - off) / sizeof(T);
    for (int i = 0; i < n; i++) {
      dst[i] = src[i];
    }
  }
  __always_inline void flush() {}
  __always_inline void read() {
    if (alignment >= 8) {
      shift<long>();
    } else {
      shift<int>();
    }
    long shift_cnt = cap - off;
    long read_cnt = min(cap - shift_cnt, rest_size);
    rest_size -= read_cnt;
    // reply = 0;
    // dma_size_rpl(get_desc, mem, ldm + shift_cnt, read_cnt, reply);
    // while (reply != 1);
    dma_init();
    pe_get(mem, ldm + shift_cnt, read_cnt);
    dma_syn();
    mem += read_cnt;
    off = 0;
  }
  template <typename T>
  __always_inline T *pop(int cnt) {
    long size = cnt * sizeof(T);
    assert(size <= cap);
    if (off + size > cap) {
      read();
    }

    T *ret = (T *)(ldm + off);
    off += size;
    off += alignment - 1;
    off &= ~(alignment - 1);
    return ret;
  }
  template <typename T>
  __always_inline T *pop_ldm(T *ptr, int count = 1) {
    T *tbuf = pop<T>(count);
    for (int i = 0; i < count; i++) {
      field_copy(ptr + i, tbuf + i);
    }
    // printf("%d\n", off);
    return tbuf;
  }
  template <typename T>
  __always_inline T *pop_mem(T *ptr, int count = 1) {
    T *tbuf = pop<T>(count);
    dma_init();
    pe_putn(ptr, tbuf, count);
    dma_syn();
    // printf("%d\n", off);
    return tbuf;
  }
};
template <typename T>
void filter_export_field(T *mem_field, int *is_stay, int natom, int export_start) {
  // dma_init();
  T field[CELL_CAP];
  dma_getn(mem_field, field, natom);
  // dma_syn();
  int stay_ptr = 0;
  int export_ptr = export_start;
  for (int i = 0; i < natom; i++) {
    if (is_stay[i]) {
      field_copy(field + stay_ptr, field + i);
      stay_ptr++;
    } else {
      field_copy(field + export_ptr, field + i);
      export_ptr++;
    }
  }
  dma_putn(mem_field, field, CELL_CAP);
  // dma_syn();
}
template <typename T>
void filter_export_ldm_field(T *field, int *is_stay, int natom, int export_start) {
  int stay_ptr = 0;
  int export_ptr = export_start;
  for (int i = 0; i < natom; i++) {
    if (is_stay[i]) {
      field_copy(field + stay_ptr, field + i);
      stay_ptr++;
    } else {
      field_copy(field + export_ptr, field + i);
      export_ptr++;
    }
  }
}

template <typename T>
int filter_export_list(T *mem_list, int *mem_first, int *is_stay, int natom, int export_start, const int max_cnt) {
  // dma_init();

  int first[CELL_CAP + 1];
  T list[max_cnt];
  dma_getn(mem_first, first, natom + 1);
  // dma_syn();
  dma_getn(mem_list, list, first[natom]);
  // dma_syn();

  int nexport = 0;
  for (int i = 0; i < natom; i++) {
    if (!is_stay[i]) {
      nexport += first[i + 1] - first[i];
    }
  }
  int list_stay_ptr = 0, list_export_ptr = max_cnt - nexport;
  int export_ptr = export_start, stay_ptr = 0;
  for (int i = 0; i < natom; i++) {
    if (is_stay[i]) {
      first[stay_ptr] = list_stay_ptr;
      for (int jj = first[i]; jj < first[i + 1]; jj++) {
        field_copy(list + list_stay_ptr, list + jj);
        list_stay_ptr++;
      }
      stay_ptr++;
    } else {
      first[export_ptr] = list_export_ptr;
      for (int jj = first[i]; jj < first[i + 1]; jj++) {
        field_copy(list + list_export_ptr, list + jj);
        list_export_ptr++;
      }
      export_ptr++;
    }
  }
  first[stay_ptr] = list_stay_ptr;
  first[export_ptr] = list_export_ptr;
  dma_putn(mem_first, first, CELL_CAP + 1);
  dma_putn(mem_list, list, list_stay_ptr);
  dma_putn(mem_list + max_cnt - nexport, list + max_cnt - nexport, nexport);
  // dma_syn();
  return nexport;
}
#define NPE_EXPORT 64
void cell_export_cpe(cellgrid_t *grid) {

  lwpf_enter(CELL_IE);
  lwpf_start(CELL_EXPORT);
  // dma_init();
  cellgrid_t lgrid;
  dma_getn(grid, &lgrid, 1);
  // dma_syn();
  int tot_bonded_export = 0, tot_chain2_export = 0, tot_scal_export = 0, tot_excl_export = 0, tot_impr_export = 0;
  FOREACH_LOCAL_CELL_CPE_RR(&lgrid, ii, jj, kk, cell) {
    int natom = cell->natom;
    int is_stay[CELL_CAP];
    // int nbonded_export = 0, nchain2_export = 0, nscal_export = 0, nexcl_export = 0, natom_export = 0, nimpr_export = 0;
    int natom_export = 0;
    vec<real> x[CELL_CAP];
    dma_getn(cell->x, x, natom);
    // dma_syn();
    for (int i = 0; i < natom; i++) {
      vec<real> delhi;
      vecsubv(delhi, lgrid.len, x[i]);

      real del_min = min(min3(x[i].x, x[i].y, x[i].z), min3(delhi.x, delhi.y, delhi.z) + 1e-8);
      if (del_min < 0) {
        natom_export++;
        is_stay[i] = 0;
      } else {
        is_stay[i] = 1;
      }
    }

    int stay_ptr = 0;
    int export_ptr = CELL_CAP - natom_export;

    filter_export_ldm_field(x, is_stay, natom, export_ptr);
    dma_putn(cell->x, x, CELL_CAP);
    // dma_syn();

    filter_export_field(cell->v, is_stay, natom, export_ptr);
    filter_export_field(cell->xinit, is_stay, natom, export_ptr);
    filter_export_field(cell->shake_tmp, is_stay, natom, export_ptr);
    filter_export_field(cell->q, is_stay, natom, export_ptr);
    filter_export_field(cell->tag, is_stay, natom, export_ptr);
    filter_export_field(cell->mol, is_stay, natom, export_ptr);
    filter_export_field(cell->t, is_stay, natom, export_ptr);
    filter_export_field(cell->mass, is_stay, natom, export_ptr);
    filter_export_field(cell->rigid, is_stay, natom, export_ptr);

    BufferPtr<8> byte_cnt;
    byte_cnt += 8; //nexport
    byte_cnt += 8; //nbytes_export
    byte_cnt.append_size(cell->x, natom_export);
    byte_cnt.append_size(cell->v, natom_export);
    byte_cnt.append_size(cell->xinit, natom_export);
    byte_cnt.append_size(cell->shake_tmp, natom_export);
    byte_cnt.append_size(cell->q, natom_export);
    byte_cnt.append_size(cell->tag, natom_export);
    byte_cnt.append_size(cell->mol, natom_export);
    byte_cnt.append_size(cell->t, natom_export);
    byte_cnt.append_size(cell->mass, natom_export);
    byte_cnt.append_size(cell->rigid, natom_export);
    /* data layout */
    cell->nbytes_export = byte_cnt.get_aligned_val();
    cell->natom = natom - natom_export;
    cell->nexport = natom_export;
  }

  lwpf_stop(CELL_EXPORT);
  lwpf_exit(CELL_IE);
}

#define SUB_CELL 4
template <typename T>
void import_field(T *mem_center, T *mem_neighbor, int *is_come, int nexport) {
  // dma_init();
  T field[CELL_CAP];
  dma_getn(mem_neighbor, field, nexport);
  // dma_syn();
  int nimport = 0;
  for (int i = 0; i < nexport; i++) {
    if (is_come[i]) {
      field_copy(field + nimport, field + i);
      nimport++;
    }
  }
  dma_putn(mem_center, field, nimport);
  // dma_syn();
}
void import_x(vec<real> * mem_center, vec<real> * mem_neighbor, int *is_come, int nexport, const vec<real> & offset) {
  vec<real> field[CELL_CAP];
  dma_getn(mem_neighbor, field, nexport);

  int nimport = 0;
  for (int i = 0; i < nexport; i++) {
    if (is_come[i]) {
      vecaddv(field[nimport], field[i], offset);
      nimport++;
    }
  }
  dma_putn(mem_center, field, nimport);
}
template <typename T>
void import_list(T *mem_list_center, T *mem_list_neighbor, int *mem_first_center, int *mem_first_neighbor, int *is_come, int &import_list_ptr, int nexport, const int max_cnt) {
  int first[CELL_CAP];
  dma_getn(mem_first_neighbor, first, nexport + 1);

  T list[max_cnt];
  dma_getn(mem_list_neighbor + first[0], list, first[nexport] - first[0]);

  int offset = first[0];
  for (int i = 0; i <= nexport; i++) {
    first[i] -= offset;
  }

  int nimport_list = 0, nimport_atom = 0;
  for (int i = 0; i < nexport; i++) {
    if (is_come[i]) {
      first[nimport_atom] = nimport_list;
      for (int jj = first[i]; jj < first[i + 1]; jj++) {
        field_copy(list + nimport_list, list + jj);
        nimport_list++;
      }
      nimport_atom++;
    }
  }

  first[nimport_atom] = nimport_list;
  for (int i = 0; i <= nimport_atom; i++) {
    first[i] += import_list_ptr;
  }

  dma_putn(mem_first_center, first, nimport_atom + 1);
  dma_putn(mem_list_center + import_list_ptr, list, nimport_list);

  import_list_ptr += nimport_list;
}
template <typename T>
void sort_field(T *mem_field, int *reorder, int natom) {
  T field_sort[CELL_CAP], field[CELL_CAP];
  dma_getn(mem_field, field, natom);
  for (int i = 0; i < natom; i++) {
    field_copy(field_sort + i, field + reorder[i]);
  }
  dma_putn(mem_field, field_sort, natom);
}
template <typename T>
void sort_field_src_ldm(T *mem_field, T *field, int *reorder, int natom) {
  T field_sort[CELL_CAP];
  for (int i = 0; i < natom; i++) {
    field_copy(field_sort + i, field + reorder[i]);
  }
  dma_putn(mem_field, field_sort, natom);
}
template <typename T>
void sort_field_dst_ldm(T *mem_field, T *field_sort, int *reorder, int natom) {
  T field[CELL_CAP];
  dma_getn(mem_field, field, natom);
  for (int i = 0; i < natom; i++) {
    field_copy(field_sort + i, field + reorder[i]);
  }
}
template <typename T>
void sort_list(T *mem_list, int *mem_first, int *reorder, int natom, const int max_cnt) {
  T list_sort[max_cnt], list[max_cnt];
  int first_sort[CELL_CAP + 1], first[CELL_CAP + 1];
  dma_getn(mem_first, first, natom + 1);
  dma_getn(mem_list, list, first[natom]);
  int nlist = 0;
  for (int i = 0; i < natom; i++) {
    first_sort[i] = nlist;
    int isrc = reorder[i];
    for (int jj = first[isrc]; jj < first[isrc + 1]; jj++) {
      field_copy(list_sort + nlist, list + jj);
      nlist++;
    }
  }
  first_sort[natom] = nlist;
  dma_putn(mem_first, first_sort, natom + 1);
  dma_putn(mem_list, list_sort, nlist);
}
#define SUBCELL 4
#include "hilbert.c"

void cell_import_cpe(cellgrid_t *grid) {
  lwpf_enter(CELL_IE);
  lwpf_start(CELL_IMPORT);

  int hib[SUBCELL][SUBCELL][SUBCELL];
  gen_hilbert((int *)hib, SUBCELL);
  cellgrid_t lgrid;
  dma_getn(grid, &lgrid, 1);

  int total_n_atom = 0;
  int tot_bonded_import = 0, tot_chain2_import = 0, tot_excl_import = 0, tot_scal_import = 0, tot_impr_import = 0;
  FOREACH_LOCAL_CELL_CPE_RR(&lgrid, ii, jj, kk, icell) {
    cellmeta_t imeta;
    dma_getn(&icell->basis, &imeta, 1);
    int nimport = 0;
    int natom = imeta.natom;
    FOREACH_NEIGHBOR(&lgrid, ii, jj, kk, dx, dy, dz, jcell) {
      cellmeta_t jmeta;
      dma_getn(&jcell->basis, &jmeta, 1);
      // dma_syn();
      int nexport = jmeta.nexport;
      int is_come[CELL_CAP];
      int export_start = CELL_CAP - nexport;
      int ncome = 0;
      vec<real> diff_basis;
      vecsubv(diff_basis, jmeta.basis, imeta.basis);
      int import_ptr = natom + nimport;
      vec<real> xj[CELL_CAP];
      dma_getn(jcell->x + export_start, xj + export_start, nexport);
      // dma_syn();
      for (int j = export_start; j < CELL_CAP; j++) {

        vec<real> jx;
        vecaddv(jx, xj[j], diff_basis);
        vec<real> delhi;
        vecsubv(delhi, lgrid.len, jx);
        real del_min = min(min3(jx.x, jx.y, jx.z), min3(delhi.x, delhi.y, delhi.z) - 1e-8);

        if (del_min >= 0) {
          is_come[j - export_start] = 1;
          ncome += 1;
          nimport++;
        } else {
          is_come[j - export_start] = 0;
        }
      }
      if (ncome > 0) {
        import_x(icell->x + import_ptr, jcell->x + export_start, is_come, nexport, diff_basis);
        import_field(icell->v + import_ptr, jcell->v + export_start, is_come, nexport);
        import_field(icell->xinit + import_ptr, jcell->xinit + export_start, is_come, nexport);
        import_field(icell->shake_tmp + import_ptr, jcell->shake_tmp + export_start, is_come, nexport);
        import_field(icell->q + import_ptr, jcell->q + export_start, is_come, nexport);
        import_field(icell->tag + import_ptr, jcell->tag + export_start, is_come, nexport);
        import_field(icell->mol + import_ptr, jcell->mol + export_start, is_come, nexport);
        import_field(icell->t + import_ptr, jcell->t + export_start, is_come, nexport);
        import_field(icell->mass + import_ptr, jcell->mass + export_start, is_come, nexport);
        import_field(icell->rigid + import_ptr, jcell->rigid + export_start, is_come, nexport);
      }
    }

    natom += nimport;
    icell->natom = natom;
    vec<real> x[CELL_CAP];
    dma_getn(icell->x, x, natom);
    int bkt[CELL_CAP];
    int head[SUB_CELL * SUB_CELL * SUB_CELL];
    for (int i = 0; i < SUB_CELL * SUB_CELL * SUB_CELL; i++) {
      head[i] = 0;
    }
    for (int i = 0; i < icell->natom; i++) {
      int subx = max(min((int)(x[i].x * lgrid.rlen.x * SUB_CELL), SUB_CELL), 0);
      int suby = max(min((int)(x[i].y * lgrid.rlen.y * SUB_CELL), SUB_CELL), 0);
      int subz = max(min((int)(x[i].z * lgrid.rlen.z * SUB_CELL), SUB_CELL), 0);
      bkt[i] = hib[subx][suby][subz];
      head[bkt[i]]++;
    }
    // puts("");
    for (int i = 1; i < SUB_CELL * SUB_CELL * SUB_CELL; i++) {
      head[i] += head[i - 1];
    }
    // puts("");

    for (int i = SUB_CELL * SUB_CELL * SUB_CELL - 1; i > 0; i--) {
      head[i] = head[i - 1];
    }
    head[0] = 0;
    int reorder[CELL_CAP];
    for (int i = 0; i < icell->natom; i++) {
      reorder[head[bkt[i]]++] = i;
    }

    real mass[CELL_CAP], rmass[CELL_CAP];
    // dma_getn(icell->mass, mass, natom);
    sort_field_src_ldm(icell->x, x, reorder, natom);
    // sort_field(icell->x, reorder, natom);
    sort_field(icell->v, reorder, natom);
    sort_field(icell->xinit, reorder, natom);
    sort_field(icell->shake_tmp, reorder, natom);
    sort_field(icell->q, reorder, natom);
    sort_field(icell->tag, reorder, natom);
    sort_field(icell->mol, reorder, natom);
    sort_field(icell->t, reorder, natom);
    // sort_field(icell->mass, reorder, natom);
    sort_field_dst_ldm(icell->mass, mass, reorder, natom);
    sort_field(icell->rigid, reorder, natom);

    for (int i = 0; i < natom; i++) {
      rmass[i] = 1.0 / mass[i];
    }
    dma_putn(icell->mass, mass, natom);
    dma_putn(icell->rmass, rmass, natom);
  }
  lwpf_stop(CELL_IMPORT);
  lwpf_exit(CELL_IE);
}
template <typename T>
void pack_cell_export(T &lbuf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_getn(&cell->basis, &meta, 1);
  dma_syn();
  lbuf.push_ldm(&meta.nexport);
  lbuf.push_ldm(&meta.nbytes_export);
  int export_start = CELL_CAP - meta.nexport;
  lbuf.push_mem(cell->x + export_start, meta.nexport);
  lbuf.push_mem(cell->v + export_start, meta.nexport);
  lbuf.push_mem(cell->xinit + export_start, meta.nexport);
  lbuf.push_mem(cell->shake_tmp + export_start, meta.nexport);
  lbuf.push_mem(cell->q + export_start, meta.nexport);
  lbuf.push_mem(cell->tag + export_start, meta.nexport);
  lbuf.push_mem(cell->mol + export_start, meta.nexport);
  lbuf.push_mem(cell->t + export_start, meta.nexport);
  lbuf.push_mem(cell->mass + export_start, meta.nexport);
  lbuf.push_mem(cell->rigid + export_start, meta.nexport);

  lbuf.flush();
}
template <class T>
void unpack_cell_export(T &lbuf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  lbuf.pop_ldm(&meta.nexport);
  lbuf.pop_ldm(&meta.nbytes_export);

  cell->nexport = meta.nexport;
  cell->nbytes_export = meta.nbytes_export;
  int export_start = CELL_CAP - meta.nexport;
  lbuf.pop_mem(cell->x + export_start, meta.nexport);
  lbuf.pop_mem(cell->v + export_start, meta.nexport);
  lbuf.pop_mem(cell->xinit + export_start, meta.nexport);
  lbuf.pop_mem(cell->shake_tmp + export_start, meta.nexport);
  lbuf.pop_mem(cell->q + export_start, meta.nexport);
  lbuf.pop_mem(cell->tag + export_start, meta.nexport);
  lbuf.pop_mem(cell->mol + export_start, meta.nexport);
  lbuf.pop_mem(cell->t + export_start, meta.nexport);
  lbuf.pop_mem(cell->mass + export_start, meta.nexport);
  lbuf.pop_mem(cell->rigid + export_start, meta.nexport);

}
template <typename T>
void pack_cell_most(T &lbuf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  pe_getn(&cell->basis, &meta, 1);
  dma_syn();
  lbuf.push_ldm(&meta.natom);
  lbuf.push_mem(cell->tag, meta.natom);
  lbuf.push_mem(cell->x, meta.natom);
  lbuf.push_mem(cell->q, meta.natom);
  lbuf.push_mem(cell->t, meta.natom);
  lbuf.push_mem(cell->mass, meta.natom);
  lbuf.push_mem(cell->rmass, meta.natom);
}
template <typename T>
void unpack_cell_most(T &lbuf, celldata_t *cell) {
  dma_init();
  cellmeta_t meta;
  lbuf.pop_ldm(&meta.natom);
  cell->natom = meta.natom;
  lbuf.pop_mem(cell->tag, meta.natom);
  lbuf.pop_mem(cell->x, meta.natom);
  lbuf.pop_mem(cell->q, meta.natom);
  lbuf.pop_mem(cell->t, meta.natom);
  lbuf.pop_mem(cell->mass, meta.natom);
  lbuf.pop_mem(cell->rmass, meta.natom);
}
template <class Buffer, void (*pack_cell)(Buffer &, celldata_t *)>
void pack_unpack_brick_cpe(packunpack_param_t *pm) {
  int altid = _COL * 8 + _ROW;
  if (altid >= NPE_PACK)
    return;
  dma_init();
  packunpack_param_t lpm;
  pe_getn(pm, &lpm, 1);
  dma_syn();
  cellgrid_t lgrid;
  pe_getn(lpm.grid, &lgrid, 1);
  dma_syn();

  int xlen = lpm.xhi - lpm.xlo;
  int ylen = lpm.yhi - lpm.ylo;
  int zlen = lpm.zhi - lpm.zlo;

  int nall = xlen * ylen * zlen;
  int npack_pe = (nall + NPE_PACK - 1) / NPE_PACK;
  int pack_st = npack_pe * altid;
  int pack_ed = min(npack_pe * (altid + 1), nall);

  if (pack_st >= pack_ed)
    return;
  Buffer lbuf((void *)((char *)lpm.buf + lpm.offset[altid]));
  for (int i = lpm.xlo; i < lpm.xhi; i++) {
    for (int j = lpm.ylo; j < lpm.yhi; j++) {
      for (int k = lpm.zlo; k < lpm.zhi; k++) {
        int idx = ((i - lpm.xlo) * ylen + j - lpm.ylo) * zlen + k - lpm.zlo;
        if (idx < pack_st || idx >= pack_ed)
          continue;
        celldata_t *cell = get_cell_xyz(&lgrid, i, j, k);
        pack_cell(lbuf, cell);
      }
    }
  }
  lbuf.flush();
}
void pack_export_cpe(packunpack_param_t *pm) {
  pack_unpack_brick_cpe<LDMPackBuffer<>, pack_cell_export>(pm);
}
void unpack_export_cpe(packunpack_param_t *pm) {
  pack_unpack_brick_cpe<LDMUnpackBuffer<32768>, unpack_cell_export>(pm);
}
void pack_most_cpe(packunpack_param_t *pm) {
  pack_unpack_brick_cpe<LDMPackBuffer<>, pack_cell_most>(pm);
}
void unpack_most_cpe(packunpack_param_t *pm) {
  pack_unpack_brick_cpe<LDMUnpackBuffer<4096>, unpack_cell_most>(pm);
}
#endif
